To compare the performance of varKoder to Skmer, we will use leave-one-out cross-validation: we remove one sample from the dataset, train a varKoder model or build a Skmer reference with the remaining samples, and then use the left-out sample as the query. We then record whether or not varKoder correctly identifies this sample, and whether or not the closest sample according to Skmer has the same identification.
For traditional barcodes, we assembled the genome of each sample, and then used BLAST to search for each of the traditional barcode genes. We recorded if we could find this gene in the assembly, coding as missing data if we could not. We then recorded whether the best BLAST hit for a sample was the correct species.
rm(list=ls())
library(tidyverse)
── Attaching core tidyverse packages ─────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.2 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.4.3 ✔ tibble 3.2.1
✔ lubridate 1.9.2 ✔ tidyr 1.3.0
✔ purrr 1.0.2 ── Conflicts ───────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(future)
library(ggthemes)
library(patchwork)
library(cowplot)
Attaching package: ‘cowplot’
The following object is masked from ‘package:patchwork’:
align_plots
The following object is masked from ‘package:ggthemes’:
theme_map
The following object is masked from ‘package:lubridate’:
stamp
library(patchwork)
library(phytools)
Loading required package: ape
Attaching package: ‘ape’
The following object is masked from ‘package:dplyr’:
where
Loading required package: maps
Attaching package: ‘maps’
The following object is masked from ‘package:purrr’:
map
library(ape)
For varKoder, we used leave-one-out cross-validation to test the accuracy for families, genera, and species in the joint Malpighiaceae-Chrysobalanaceae dataset. We used as input data varKodes produced from kmers of size 7 and 500Kbp to 200Mbp of data, or all of the data available if less than 200 Mbp. For each sample, we built a model using as input the data from all other samples. Then we queried the sample left out, using as input the images generated from 500Kbp up to the total data available. Now we will summarize the results.
In this test, we used varKoder v0.6.0. Let’s process the results.
#' Read and score varKoder cross-validation predictions.
#'
#' Recursively collects the `predictions.csv` files below `infolder`, keeps only
#' queries whose size label is one of the standardized sizes, and flags for each
#' taxonomic rank (family, genus, species) whether the sample's own label was
#' among the predictions (`*_correct`) and whether any predicted label of that
#' rank is not one of the sample's labels (`*_incorrect`).
#'
#' @param infolder Directory searched recursively for prediction files.
#' @param workers Number of parallel R sessions used to read the files.
#'   Defaults to 12, the value that used to be hard-coded.
#' @return A row-wise tibble with one row per retained prediction, including
#'   the list-columns `query_labels` and `predicted_list` and the six logical
#'   `*_correct` / `*_incorrect` columns. Species flags are `NA` for samples
#'   without a species label.
read_and_process_xval <- function(infolder, workers = 12) {
  prediction_files <- list.files(infolder,
                                 'predictions.csv',
                                 recursive = TRUE,
                                 full.names = TRUE)
  if (length(prediction_files) == 0) {
    stop("No 'predictions.csv' files found under: ", infolder, call. = FALSE)
  }

  # Read in parallel, then hand the caller back whatever future plan was
  # active before, even if reading fails part-way through.
  old_plan <- plan(multisession, workers = workers)
  on.exit(plan(old_plan), add = TRUE)

  varkoder_results <- prediction_files %>%
    furrr::future_map_dfr(~ read_csv(.x) %>% mutate(sample_id = as.character(sample_id))) %>%
    # Drops the first column by position (presumably a saved row index --
    # NOTE(review): confirm against the varKoder output format).
    select(-1) %>%
    # we will ignore queries that are not standardized sizes
    filter(str_detect(query_basepairs, '^0+[125]0+K$')) %>%
    rename(query_bp = query_basepairs) %>%
    mutate(quality_included = TRUE)

  varkoder_results %>%
    mutate(
      # The quality flag is not a taxonomic label, so strip it before splitting.
      query_labels = str_remove(actual_labels, ";*low_quality:True;*") %>% str_split(';'),
      predicted_list = str_split(predicted_labels, ';')
    ) %>%
    # Row-wise so that each list-column cell is seen as a plain character vector.
    rowwise() %>%
    mutate(
      family_correct = query_labels[str_detect(query_labels, 'family')] %in% predicted_list,
      genus_correct = query_labels[str_detect(query_labels, 'genus')] %in% predicted_list,
      # Samples without a species label cannot be scored at species level.
      species_correct = ifelse(any(str_detect(query_labels, 'species')),
                               query_labels[str_detect(query_labels, 'species')] %in% predicted_list,
                               NA),
      family_incorrect = any(!(predicted_list[str_detect(predicted_list, 'family')] %in%
                                 query_labels[str_detect(query_labels, 'family')])),
      genus_incorrect = any(!(predicted_list[str_detect(predicted_list, 'genus')] %in%
                                query_labels[str_detect(query_labels, 'genus')])),
      species_incorrect = ifelse(any(str_detect(query_labels, 'species')),
                                 any(!(predicted_list[str_detect(predicted_list, 'species')] %in%
                                         query_labels[str_detect(query_labels, 'species')])),
                                 NA)
    )
}
#' Tabulate identification outcomes per query size for one taxonomic rank.
#'
#' Each row of `res` is classified from its `<level>_correct` and
#' `<level>_incorrect` flags:
#'   correct      -- the true label was predicted and nothing else was;
#'   ambiguous    -- the true label and at least one wrong label were predicted;
#'   incorrect    -- only wrong labels were predicted;
#'   inconclusive -- neither the true label nor a wrong one was predicted.
#' Rows whose flags are `NA` are dropped.
#'
#' @param res Data frame with `query_bp` (size in Kbp, optionally suffixed with
#'   "K") and the logical columns `<level>_correct` and `<level>_incorrect`.
#' @param level Rank prefix of the flag columns, e.g. 'family', 'genus' or
#'   'species'.
#' @return An ungrouped tibble with every combination of `query_bp` (numeric,
#'   in base pairs) and observed `result`, the count `N` and the within-size
#'   fraction `p`; combinations that never occur get `N = 0` and `p = 0`.
summarize_results <- function(res, level) {
  # Pull the two flag columns out as plain logical vectors.
  is_correct <- res[[str_c(level, 'correct', sep = '_')]]
  is_incorrect <- res[[str_c(level, 'incorrect', sep = '_')]]

  # Nested ifelse() (rather than case_when) so that NA flags stay NA.
  outcome <- ifelse(is_correct & !is_incorrect, 'correct',
                    ifelse(is_correct & is_incorrect, 'ambiguous',
                           ifelse(!is_correct & is_incorrect, 'incorrect',
                                  'inconclusive')))

  res %>%
    ungroup() %>%
    mutate(result = as.character(outcome)) %>%
    filter(!is.na(result)) %>%
    count(query_bp, result, name = 'N') %>%
    group_by(query_bp) %>%
    mutate(p = N / sum(N)) %>%
    ungroup() %>%
    # query_bp is expressed in Kbp; convert to base pairs.
    mutate(query_bp = as.integer(str_remove(query_bp, 'K')) * 1000) %>%
    # A temporary factor lets complete() fill in every size x outcome cell.
    mutate(query_bp = as.factor(query_bp)) %>%
    complete(query_bp, result, fill = list(p = 0, N = 0)) %>%
    mutate(query_bp = as.numeric(as.character(query_bp))) %>%
    ungroup()
}
#' Stacked area plot of identification outcomes against query size.
#'
#' @param sum_df Summary table with columns `query_bp`, `result`, `N` and `p`
#'   (as returned by `summarize_results`).
#' @param title Plot title.
#' @param relative If TRUE plot the fraction `p` (y axis 0-1); otherwise plot
#'   the count `N` (y axis from 0 to the largest per-size total).
#' @param grid If TRUE draw horizontal major/minor grid lines on top of the
#'   areas and use about 10 y breaks.
#' @param xlim_all If TRUE the x axis spans the full 500 Kbp - 200 Mbp range;
#'   otherwise it spans the sizes present in `sum_df`.
#' @param wrap Optional facetting formula (or string coercible by
#'   `as.formula`) passed to `facet_wrap`. `NULL` (default) means no facets.
#' @return A ggplot object.
plot_area <- function(sum_df, title, relative = FALSE, grid = TRUE, xlim_all = TRUE, wrap = NULL) {
  breaks <- c(500000,
              1000000,
              2000000,
              5000000,
              10000000,
              20000000,
              50000000,
              100000000,
              200000000)
  outcome_levels <- c('correct', 'ambiguous', 'inconclusive', 'incorrect')

  xlimits <- if (xlim_all) range(breaks) else range(sum_df$query_bp)

  sum_df <- sum_df %>%
    mutate(result = factor(result, ordered = TRUE, levels = outcome_levels))

  # Everything that differs between the relative and absolute plots.
  if (relative) {
    y_col <- 'p'
    y_label <- 'Fraction of samples'
    ylimits <- c(0, 1)
  } else {
    y_col <- 'N'
    y_label <- 'Number of samples'
    ylimits <- c(0, sum_df %>% group_by(query_bp) %>% summarize(N = sum(N)) %>% pull(N) %>% max())
  }

  # Build the y scale exactly once; adding a second one later makes ggplot2
  # emit "Scale for y is already present" on every call.
  y_scale <- if (grid) {
    scale_y_continuous(n.breaks = 10, minor_breaks = waiver())
  } else {
    scale_y_continuous()
  }

  # Get colors from a Color Brewer palette
  brewer_colors <- RColorBrewer::brewer.pal(4, "Accent")

  p1 <- ggplot(sum_df, aes(x = query_bp, y = .data[[y_col]], fill = result)) +
    geom_area(position = 'stack') +
    scale_fill_manual(values = setNames(brewer_colors, outcome_levels)) +
    # No alpha aesthetic is mapped here; the scale is kept as in the original.
    scale_alpha_manual(values = c(0.5, 1)) +
    scale_x_log10(labels = scales::label_number(scale_cut = scales::cut_si('bp')), breaks = breaks) +
    y_scale +
    ggtitle(title) +
    ylab(y_label) +
    xlab('Base pairs in query images') +
    theme_few() +
    theme(axis.text.x = element_text(hjust = 1, angle = 45))

  if (grid) {
    # Transparent panel drawn on top so the grid lines show over the areas.
    p1 <- p1 +
      theme(panel.background = element_rect(fill = NA),
            panel.grid.major.y = element_line(colour = gray(0.5)),
            panel.grid.minor.y = element_line(colour = gray(0.6), linetype = 2),
            panel.ontop = TRUE)
  }

  p1 <- p1 + coord_cartesian(xlim = xlimits, ylim = ylimits, expand = FALSE)

  if (!is.null(wrap)) {
    p1 <- p1 + facet_wrap(as.formula(wrap))
  }
  p1
}
Now let’s plot genus-level accuracy for a model taking quality labels into account:
# Load and score every varKoder cross-validation prediction.
results <- read_and_process_xval('Malpighiaceae+Chrysobalanaceae/varKoder/vit_results/')

# Genus-level outcomes per query size, drawn as stacked fractions.
summary_genus <- summarize_results(results, level = 'genus')
p_genus <- plot_area(summary_genus, title = 'varKoder genus', relative = TRUE)
p_genus
Now the same but with species
# Species-level outcomes per query size, drawn as stacked fractions.
summary_species <- summarize_results(results, level = 'species')
p_species <- plot_area(summary_species, title = 'varKoder species', relative = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_species
Finally, family
# Family-level outcomes per query size, drawn as stacked fractions.
summary_family <- summarize_results(results, level = 'family')
p_family <- plot_area(summary_family, title = 'varKoder family', relative = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_family
Now we will try to identify which samples failed and why they failed. In particular, how do DNA quality, amount of data, and the number of samples per class impact the results? We will use genus-level predictions to test this.
# For every query, keep the confidence that the model assigned to the sample's
# true genus. Only the per-genus confidence columns (named "genus:<name>") are
# pivoted; the logical genus_correct / genus_incorrect flags are left out so
# they are not coerced into the numeric confidence column.
genus_predictions <- results %>%
  ungroup() %>%
  mutate(actual_genus = str_extract(actual_labels, 'genus:[^;]*')) %>%
  pivot_longer(cols = starts_with('genus:'),
               names_to = 'predicted_label',
               values_to = 'confidence') %>%
  filter(actual_genus == predicted_label) %>%
  select(query_bp, sample_id, basefrequency_sd, actual_genus, confidence) %>%
  # query_bp is a size label in Kbp ("...K"); convert to base pairs.
  mutate(query_bp = 1000 * as.integer(str_remove(query_bp, 'K')))

# Attach the number of distinct samples available for each genus, with the
# join key spelled out instead of guessed from shared column names.
genus_predictions <- genus_predictions %>%
  distinct(sample_id, actual_genus) %>%
  count(actual_genus, name = 'N_samples') %>%
  right_join(genus_predictions, by = 'actual_genus')
Joining with `by = join_by(actual_genus)`
genus_predictions
Now let’s make some plots. First, what is the effect of the number of samples per class on confidence?
# Confidence in the true genus against the number of samples of that genus
# (one is subtracted from N_samples before plotting). The y axis is linear on
# 0-1; a logit-scaled axis is a possible alternative.
plot_genus_N_vs_conf <- ggplot(
  genus_predictions,
  aes(x = N_samples - 1, y = confidence)
) +
  geom_jitter(alpha = 0.3) +
  scale_color_viridis_c() +
  scale_x_log10() +
  scale_y_continuous(limits = c(0, 1)) +
  labs(
    x = 'Number of samples in correct genus\n(log scale)',
    y = 'Confidence in correct prediction'
  ) +
  theme_few() +
  theme(panel.grid.major.y = element_line(colour = gray(0.8)))
plot_genus_N_vs_conf
Now, what is the effect of sample quality on confidence?
# Confidence in the true genus against the standard deviation of base
# frequencies (log x axis, linear 0-1 y axis; a logit-scaled y axis is a
# possible alternative).
plot_genus_freqsd_vs_conf <- ggplot(
  genus_predictions,
  aes(x = basefrequency_sd, y = confidence)
) +
  geom_point(alpha = 0.3) +
  scale_x_log10() +
  scale_y_continuous(limits = c(0, 1)) +
  labs(
    x = 'Standard deviation of base frequencies',
    y = 'Confidence in correct prediction'
  ) +
  theme_few() +
  theme(panel.grid.major.y = element_line(colour = gray(0.8)))
plot_genus_freqsd_vs_conf
Now, what is the effect of the amount of data on confidence?
# Confidence in the true genus against the number of base pairs used for the
# query image (log x axis, linear 0-1 y axis; a logit-scaled y axis is a
# possible alternative).
plot_genus_bp_vs_conf <- ggplot(
  genus_predictions,
  aes(x = query_bp, y = confidence)
) +
  geom_jitter(alpha = 0.3) +
  scale_x_log10() +
  scale_y_continuous(limits = c(0, 1)) +
  labs(
    x = 'Base pairs in query images\n(log scale)',
    y = 'Confidence in correct prediction'
  ) +
  theme_few() +
  theme(panel.grid.major.y = element_line(colour = gray(0.8)))
plot_genus_bp_vs_conf
Now let’s save the three of them as a single plot using patchwork.
# Combine the three confidence plots side by side with panel tags A, B, C.
# Only the first panel keeps its y-axis title and labels.
combined_conf <- patchwork::wrap_plots(
  plot_genus_N_vs_conf +
    theme(text = element_text(size = 8)),
  plot_genus_bp_vs_conf +
    theme(axis.title.y = element_blank(),
          axis.text.y = element_blank(),
          text = element_text(size = 8)),
  plot_genus_freqsd_vs_conf +
    theme(axis.title.y = element_blank(),
          axis.text.y = element_blank(),
          text = element_text(size = 8))
) +
  patchwork::plot_annotation(tag_levels = 'A')
combined_conf

# Pass the plot explicitly: the ggsave() default is the last plot displayed,
# which is a different figure whenever the line above is not auto-printed
# (e.g. when the script is run through source()).
ggsave(filename = 'images_manuscript/supp_conf_predictors.pdf',
       plot = combined_conf,
       device = 'pdf',
       width = 7,
       height = 3,
       units = 'in',
       useDingbats = FALSE)
Let’s put it all together now in a linear model:
# Response: logit of the confidence in the true genus. Confidences of exactly
# 1 or exactly 0 are nudged inside (0, 1) by 1e-7 first; otherwise the logit is
# infinite, and car::logit() would instead remap every proportion with its
# default adjustment as soon as a single 0 or 1 is present.
# Predictors are centred and scaled to unit standard deviation.
lm_data <- genus_predictions %>%
  mutate(
    confidence = case_when(
      confidence >= 1 ~ 1 - 0.0000001,
      confidence <= 0 ~ 0.0000001,
      TRUE ~ confidence
    ),
    confidence = car::logit(confidence)
  ) %>%
  mutate(
    query_bp = (query_bp - mean(query_bp)) / sd(query_bp),
    basefrequency_sd = (basefrequency_sd - mean(basefrequency_sd)) / sd(basefrequency_sd),
    N_samples = (N_samples - mean(N_samples)) / sd(N_samples)
  )

# Full model: three main effects and all of their interactions.
full_model <- lm(formula = confidence ~ query_bp * basefrequency_sd * N_samples,
                 data = lm_data)
full_model
Call:
lm(formula = confidence ~ query_bp * basefrequency_sd * N_samples,
data = lm_data)
Coefficients:
(Intercept) query_bp basefrequency_sd
4.92334 0.16749 -0.58514
N_samples query_bp:basefrequency_sd query_bp:N_samples
1.67950 0.24651 0.01799
basefrequency_sd:N_samples query_bp:basefrequency_sd:N_samples
0.01540 0.02975
summary(full_model)
Call:
lm(formula = confidence ~ query_bp * basefrequency_sd * N_samples,
data = lm_data)
Residuals:
Min 1Q Median 3Q Max
-16.1052 -1.1535 0.2901 1.4519 5.8115
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.92334 0.05383 91.468 < 2e-16 ***
query_bp 0.16749 0.07211 2.323 0.0203 *
basefrequency_sd -0.58514 0.10373 -5.641 1.9e-08 ***
N_samples 1.67950 0.05676 29.589 < 2e-16 ***
query_bp:basefrequency_sd 0.24651 0.18896 1.305 0.1922
query_bp:N_samples 0.01799 0.07770 0.232 0.8169
basefrequency_sd:N_samples 0.01540 0.12390 0.124 0.9011
query_bp:basefrequency_sd:N_samples 0.02975 0.22380 0.133 0.8943
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.217 on 2251 degrees of freedom
Multiple R-squared: 0.4317, Adjusted R-squared: 0.4299
F-statistic: 244.3 on 7 and 2251 DF, p-value: < 2.2e-16
plot(full_model)
reduced_model = step(full_model, direction ="both")
Start: AIC=3604.28
confidence ~ query_bp * basefrequency_sd * N_samples
Df Sum of Sq RSS AIC
- query_bp:basefrequency_sd:N_samples 1 0.086829 11060 3602.3
<none> 11060 3604.3
Step: AIC=3602.29
confidence ~ query_bp + basefrequency_sd + N_samples + query_bp:basefrequency_sd +
query_bp:N_samples + basefrequency_sd:N_samples
Df Sum of Sq RSS AIC
- basefrequency_sd:N_samples 1 0.0044 11060 3600.3
- query_bp:N_samples 1 0.2090 11060 3600.3
<none> 11060 3602.3
- query_bp:basefrequency_sd 1 14.9441 11075 3603.3
+ query_bp:basefrequency_sd:N_samples 1 0.0868 11060 3604.3
Step: AIC=3600.29
confidence ~ query_bp + basefrequency_sd + N_samples + query_bp:basefrequency_sd +
query_bp:N_samples
Df Sum of Sq RSS AIC
- query_bp:N_samples 1 0.2053 11060 3598.3
<none> 11060 3600.3
- query_bp:basefrequency_sd 1 15.0305 11075 3601.4
+ basefrequency_sd:N_samples 1 0.0044 11060 3602.3
Step: AIC=3598.34
confidence ~ query_bp + basefrequency_sd + N_samples + query_bp:basefrequency_sd
Df Sum of Sq RSS AIC
<none> 11060 3598.3
- query_bp:basefrequency_sd 1 14.8 11075 3599.4
+ query_bp:N_samples 1 0.2 11060 3600.3
+ basefrequency_sd:N_samples 1 0.0 11060 3600.3
- N_samples 1 5986.7 17047 4573.6
reduced_model
Call:
lm(formula = confidence ~ query_bp + basefrequency_sd + N_samples +
query_bp:basefrequency_sd, data = lm_data)
Coefficients:
(Intercept) query_bp basefrequency_sd N_samples
4.9219 0.1652 -0.5959 1.6755
query_bp:basefrequency_sd
0.2246
summary(reduced_model)
Call:
lm(formula = confidence ~ query_bp + basefrequency_sd + N_samples +
query_bp:basefrequency_sd, data = lm_data)
Residuals:
Min 1Q Median 3Q Max
-16.1127 -1.1572 0.2955 1.4482 5.8153
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.92189 0.04832 101.855 < 2e-16 ***
query_bp 0.16518 0.05680 2.908 0.00367 **
basefrequency_sd -0.59594 0.06579 -9.058 < 2e-16 ***
N_samples 1.67552 0.04797 34.929 < 2e-16 ***
query_bp:basefrequency_sd 0.22460 0.12921 1.738 0.08230 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.215 on 2254 degrees of freedom
Multiple R-squared: 0.4317, Adjusted R-squared: 0.4307
F-statistic: 428 on 4 and 2254 DF, p-value: < 2.2e-16
plot(reduced_model)
For Skmer, we left each sample out, built a reference and then queried that sample. We have several files in which reference samples are ordered by their distance to the query; here we will evaluate whether the closest sample is from the correct species or genus.
Because it is not clear how Skmer behaves for different levels of coverage, we repeated this for several input sizes (in number of base pairs) as query, but always used the maximum input size available (up to 200Mb) for references.
Let’s make a function that extracts these results as a table.
# Lookup table: one row per distinct (sample_id, actual_labels) pair.
samp_labels <- distinct(select(results, sample_id, actual_labels))
#' Extract the query and its closest reference from one skmer output file.
#'
#' Only the first two lines are read: the first is parsed for the query as
#' `<sample>@<size>K`, the second for the nearest reference as
#' `<sample>@...` followed by a decimal distance.
#'
#' @param file_path Path to a skmer result file.
#' @return A one-row tibble with `sample_id`, `query_bp` (size label such as
#'   "500K"), `closest_reference_sample_id` and numeric `closest_distance`.
#'   Fields that cannot be parsed are `NA`.
extract_skmer_results <- function(file_path) {
  header_and_top_hit <- readLines(file_path, n = 2)

  # Column 1 of each match matrix is the whole match; 2 and 3 are the groups.
  query_match <- str_match(header_and_top_hit[1], "\\s*(.*?)@(\\d+K)")
  hit_match <- str_match(header_and_top_hit[2], "\\s*(.*?)@.*\\s+(\\d+\\.\\d+)")

  tibble(
    sample_id = query_match[1, 2],
    query_bp = query_match[1, 3],
    closest_reference_sample_id = hit_match[1, 2],
    closest_distance = as.numeric(hit_match[1, 3])
  )
}
Now we will apply this function to all skmer output files.
# Parse all skmer output files in parallel, using the documented
# plan(strategy, workers = ...) form.
plan(multisession, workers = 12)
skmer_results_df <- furrr::future_map_dfr(
  list.files('Malpighiaceae+Chrysobalanaceae/skmer/skmer_xval_results/', full.names = TRUE),
  ~ extract_skmer_results(.x)
) %>%
  # True labels of the query sample.
  left_join(samp_labels, by = 'sample_id') %>%
  # Labels of the closest reference act as the "prediction".
  left_join(
    samp_labels %>% select(
      closest_reference_sample_id = 'sample_id',
      predicted_labels = actual_labels
    ),
    by = 'closest_reference_sample_id'
  ) %>%
  mutate(
    # The quality flag is not a taxonomic label, so strip it before splitting.
    query_labels = str_remove(actual_labels, ";*low_quality:True;*") %>% str_split(';'),
    predicted_list = str_split(predicted_labels, ';')
  ) %>%
  # Row-wise so that each list-column cell is seen as a plain character vector.
  rowwise() %>%
  mutate(
    family_correct = query_labels[str_detect(query_labels, 'family')] %in% predicted_list,
    genus_correct = query_labels[str_detect(query_labels, 'genus')] %in% predicted_list,
    # Samples without a species label cannot be scored at species level.
    species_correct = ifelse(any(str_detect(query_labels, 'species')),
                             query_labels[str_detect(query_labels, 'species')] %in% predicted_list,
                             NA),
    family_incorrect = any(!(predicted_list[str_detect(predicted_list, 'family')] %in%
                               query_labels[str_detect(query_labels, 'family')])),
    genus_incorrect = any(!(predicted_list[str_detect(predicted_list, 'genus')] %in%
                              query_labels[str_detect(query_labels, 'genus')])),
    species_incorrect = ifelse(any(str_detect(query_labels, 'species')),
                               any(!(predicted_list[str_detect(predicted_list, 'species')] %in%
                                       query_labels[str_detect(query_labels, 'species')])),
                               NA)
  )
# Back to sequential evaluation once the files are read.
plan(sequential)
skmer_results_df
Now let’s summarize and plot by genus:
# Genus-level Skmer outcomes per query size, drawn as stacked fractions.
skmer_summary_genus <- summarize_results(skmer_results_df, level = 'genus')
p_skmer_genus <- plot_area(skmer_summary_genus, title = 'Skmer genus', relative = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_skmer_genus
Now by species. In Skmer, there is no inconclusive result: if there is no correct species prediction, it means that a sample was predicted in the wrong genus and therefore it is incorrect
# Species-level Skmer outcomes. Every outcome other than 'correct' is folded
# into 'incorrect', and the counts and fractions of the folded categories are
# added up within each query size. across() replaces the superseded
# summarise_all(), and the result is returned ungrouped.
skmer_summary_species <- summarize_results(skmer_results_df, 'species') %>%
  mutate(result = ifelse(result == 'correct', 'correct', 'incorrect')) %>%
  group_by(query_bp, result) %>%
  summarise(across(everything(), sum), .groups = 'drop')
p_skmer_species <- plot_area(skmer_summary_species, 'Skmer species', relative = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_skmer_species
And now by family:
# Family-level Skmer outcomes per query size: print the table, then build the
# stacked-fraction plot.
skmer_summary_family <- summarize_results(skmer_results_df, level = 'family')
skmer_summary_family
p_skmer_family <- plot_area(skmer_summary_family, title = 'Skmer family', relative = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_skmer_family
Let’s now read the traditional barcode BLAST results and summarize them in the same way as Skmer and varKoder. Let’s start by defining a function that reads the data so we can summarize it using the previously defined functions.
#' Read and score traditional-barcode BLAST results for one input size.
#'
#' Reads `<results_dir><bp>M_blast_phylo_sum_sp.tsv`, reshapes it to one row
#' per sample and marker, looks up the labels of the query and of its best
#' hit, and computes the same `*_correct` / `*_incorrect` flags used for
#' varKoder and Skmer. Rows from the `Concatenated_phylogeny` column are
#' dropped. When there is no hit (no predicted labels), every non-missing flag
#' is set to FALSE.
#'
#' @param bp Input size as used in the file name (the number before "M").
#' @param results_dir Directory (with trailing slash) holding the summary
#'   files. Defaults to the path that used to be hard-coded.
#' @param sample_labels Table with `sample_id` and `actual_labels`; defaults
#'   to the global `samp_labels`.
#' @return A row-wise tibble with one row per sample and marker; `query_bp`
#'   holds `bp * 1e3`.
read_traditional_barcodes <- function(
    bp,
    results_dir = 'Malpighiaceae+Chrysobalanaceae/traditional_barcodes/2_blast_phylogeny_result/Genus/',
    sample_labels = samp_labels) {
  input_file <- paste0(results_dir, bp, 'M_blast_phylo_sum_sp.tsv')

  # Explicit delimiter instead of guessing; the column-spec message is silenced.
  read_delim(input_file, delim = '\t', show_col_types = FALSE) %>%
    pivot_longer(-sp, names_to = 'marker', values_to = 'closest_reference_sample_id') %>%
    rename(sample_id = 'sp') %>%
    mutate(
      # Keep only the sample id that precedes the first '@'.
      sample_id = str_remove_all(sample_id, '@.+'),
      closest_reference_sample_id = str_remove_all(closest_reference_sample_id, '@.+'),
      predicted_labels = sample_labels$actual_labels[match(closest_reference_sample_id, sample_labels$sample_id)],
      actual_labels = sample_labels$actual_labels[match(sample_id, sample_labels$sample_id)]
    ) %>%
    filter(marker != 'Concatenated_phylogeny') %>%
    mutate(
      # The quality flag is not a taxonomic label, so strip it before splitting.
      query_labels = str_remove(actual_labels, ";*low_quality:True;*") %>% str_split(';'),
      predicted_list = str_split(predicted_labels, ';')
    ) %>%
    # Row-wise so that each list-column cell is seen as a plain character vector.
    rowwise() %>%
    mutate(
      family_correct = query_labels[str_detect(query_labels, 'family')] %in% predicted_list,
      genus_correct = query_labels[str_detect(query_labels, 'genus')] %in% predicted_list,
      # Samples without a species label cannot be scored at species level.
      species_correct = ifelse(any(str_detect(query_labels, 'species')),
                               query_labels[str_detect(query_labels, 'species')] %in% predicted_list,
                               NA),
      family_incorrect = any(!(predicted_list[str_detect(predicted_list, 'family')] %in%
                                 query_labels[str_detect(query_labels, 'family')])),
      genus_incorrect = any(!(predicted_list[str_detect(predicted_list, 'genus')] %in%
                                query_labels[str_detect(query_labels, 'genus')])),
      species_incorrect = ifelse(any(str_detect(query_labels, 'species')),
                                 any(!(predicted_list[str_detect(predicted_list, 'species')] %in%
                                         query_labels[str_detect(query_labels, 'species')])),
                                 NA)
    ) %>%
    # No hit at all: neither correct nor incorrect, unless the flag is NA.
    mutate(across(
      c(ends_with('_correct'), ends_with('_incorrect')),
      ~ ifelse(is.na(predicted_labels) & !is.na(.x), FALSE, .x)
    )) %>%
    # summarize_results() multiplies query_bp by 1000 again.
    mutate(query_bp = bp * 1e3)
}
Now we can apply this function to all of our results:
# Read the barcode results for every input size and stack them row-wise.
results_barcodes <- purrr::map_dfr(
  c(10, 20, 50, 100, 200),
  read_traditional_barcodes
)
Rows: 288 Columns: 7── Column specification ───────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 285 Columns: 7── Column specification ───────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 267 Columns: 7── Column specification ───────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 200 Columns: 7── Column specification ───────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 166 Columns: 7── Column specification ───────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
results_barcodes
Now let’s summarise for each marker separately:
# Summarize outcomes separately for each marker at each taxonomic rank; the
# marker name is carried into the `marker` column of the stacked result.
barcode_summary_family <- results_barcodes %>%
  split(.$marker) %>%
  purrr::map_dfr(function(marker_df) summarize_results(marker_df, 'family'), .id = 'marker')
barcode_summary_family

barcode_summary_genus <- results_barcodes %>%
  split(.$marker) %>%
  purrr::map_dfr(function(marker_df) summarize_results(marker_df, 'genus'), .id = 'marker')
barcode_summary_genus

barcode_summary_species <- results_barcodes %>%
  split(.$marker) %>%
  purrr::map_dfr(function(marker_df) summarize_results(marker_df, 'species'), .id = 'marker')
barcode_summary_species
Now let’s plot, making separate plots for each marker:
Species:
# One species-level stacked-fraction plot per marker, returned as a named list.
p_barcode_species <- purrr::map(
  split(barcode_summary_species, barcode_summary_species$marker),
  function(marker_df) {
    plot_area(marker_df,
              paste0(unique(marker_df$marker), ' species'),
              relative = TRUE,
              xlim_all = TRUE)
  }
)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_barcode_species
$ITS
$matK
$ndhF
$rbcL
$`trnL-F`
Genera:
# One genus-level stacked-fraction plot per marker, returned as a named list.
p_barcode_genus <- purrr::map(
  split(barcode_summary_genus, barcode_summary_genus$marker),
  function(marker_df) {
    plot_area(marker_df,
              paste0(unique(marker_df$marker), ' genus'),
              relative = TRUE,
              xlim_all = TRUE)
  }
)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_barcode_genus
$ITS
$matK
$ndhF
$rbcL
$`trnL-F`
Family:
# One family-level stacked-fraction plot per marker, returned as a named list.
p_barcode_family <- purrr::map(
  split(barcode_summary_family, barcode_summary_family$marker),
  function(marker_df) {
    plot_area(marker_df,
              paste0(unique(marker_df$marker), ' family'),
              relative = TRUE,
              xlim_all = TRUE)
  }
)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_barcode_family
$ITS
$matK
$ndhF
$rbcL
$`trnL-F`
Now we will do the same for concatenated tree. Let’s start by defining a function to gather results. We will consider a result as correct if the majority of the sister taxon to a tip has the same label.
# Score nearest-reference identifications on the concatenated-barcode tree
# for one data amount.
#
# Reads the tree `conc.<bp>m.spname.tre` and the table
# `<bp>M_blast_phylo_sum_sp.tsv`, finds for every non-reference tip the
# reference tip (label ending in `_ref`) with the smallest patristic distance,
# and compares the labels of the query sample with those of that reference.
#
# Args:
#   bp: number used to build both input file names (e.g. 10).
#
# Returns:
#   A rowwise tibble whose rows follow the samples listed in the TSV table
#   (right join; sample '2095' is dropped), with the closest reference sample,
#   actual and predicted labels, logical `*_correct` / `*_incorrect` columns
#   for family, genus and species, and `query_bp = bp * 1e3`.
#
# Note: reads the global `samp_labels` (columns `sample_id`, `actual_labels`).
read_concatenated_tree_results = function(bp){
  barcode_dir = 'Malpighiaceae+Chrysobalanaceae/traditional_barcodes/2_blast_phylogeny_result/Genus/'

  # Read the concatenated-barcode tree built for this data amount
  tree = read.tree(paste0(barcode_dir, 'conc.', bp, 'm.spname.tre'))
  # leave only sample IDs as tip labels
  tree$tip.label = tree$tip.label %>%
    str_remove(".*@") %>%
    str_remove("'") %>%
    str_replace(' ref', '_ref')

  # Compute the patristic distances and separate reference tips from query tips
  patristic_distances = cophenetic(tree)
  tip_names = dimnames(patristic_distances)[[1]]
  is_reference = str_detect(tip_names, '_ref$')
  all_ref_names = tip_names[is_reference]
  all_nonref = tip_names[!is_reference]

  # For each tip, find the reference sample with closest patristic distance,
  # never using the reference built from the tip's own sample.
  find_closest = function(tip){
    # Exact comparison: a regex/substring test would also discard references of
    # other samples whose ID merely ends with this tip's ID (e.g. '1209_ref' for '209').
    candidates = all_ref_names[all_ref_names != paste0(tip, '_ref')]
    if (length(candidates) == 0) {
      return(NA_character_)
    }
    closest = which.min(patristic_distances[tip, candidates])
    if (length(closest) == 0) {
      return(NA_character_)
    }
    str_remove(candidates[closest], '_ref$')
  }
  closest_match = purrr::map_chr(all_nonref, find_closest)

  # Samples listed in the per-sample table; the sample ID is the text before '@'
  samples_with_data = read_delim(paste0(barcode_dir, bp, 'M_blast_phylo_sum_sp.tsv'),
                                 delim = '\t',
                                 show_col_types = FALSE) %>%
    select(sample_id = sp) %>%
    mutate(sample_id = str_remove_all(sample_id, '@.+'))

  barcode_res = tibble(sample_id = all_nonref,
                       closest_reference_sample_id = closest_match) %>%
    # right join: samples absent from the tree are kept, with NA as closest reference
    right_join(samples_with_data, by = 'sample_id') %>%
    mutate(
      predicted_labels = samp_labels$actual_labels[match(closest_reference_sample_id, samp_labels$sample_id)],
      actual_labels = samp_labels$actual_labels[match(sample_id, samp_labels$sample_id)]
    ) %>%
    filter(sample_id!='2095') %>%
    mutate(
      query_labels = str_remove(actual_labels, ";*low_quality:True;*") %>% str_split(';'),
      predicted_list = str_split(predicted_labels, ';')
    ) %>%
    rowwise() %>%
    mutate(
      family_correct = query_labels[str_detect(query_labels, 'family')] %in% predicted_list,
      genus_correct = query_labels[str_detect(query_labels, 'genus')] %in% predicted_list,
      # species is scored only when the query sample has a species label
      species_correct = ifelse(any(str_detect(
        query_labels, 'species'
      )),
      query_labels[str_detect(query_labels, 'species')] %in% predicted_list,
      NA),
      family_incorrect = any(!(predicted_list[str_detect(predicted_list, 'family')] %in% query_labels[str_detect(query_labels, 'family')])),
      genus_incorrect = any(!(predicted_list[str_detect(predicted_list, 'genus')] %in% query_labels[str_detect(query_labels, 'genus')])),
      species_incorrect = ifelse(any(str_detect(
        query_labels, 'species'
      )),
      any(!(
        predicted_list[str_detect(predicted_list, 'species')] %in% query_labels[str_detect(query_labels, 'species')]
      )),
      NA)
    ) %>%
    # without a prediction, a result is neither correct nor incorrect (NA stays NA)
    mutate(across(c(ends_with("_correct"), ends_with("_incorrect")),
                  ~ ifelse(is.na(predicted_labels) & !is.na(.x), FALSE, .x))) %>%
    # NOTE(review): the file names suggest bp is given in Mbp, which would make
    # this value Kbp - confirm against what summarize_results() expects.
    mutate(query_bp = bp * 1e3)

  barcode_res
}
Now let’s apply this function
# Run the concatenated-tree evaluation for every data amount and stack the rows.
results_concat_barcodes <- c(10, 20, 50, 100, 200) %>%
  purrr::map_dfr(read_concatenated_tree_results)
Rows: 288 Columns: 7── Column specification ───────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Rows: 285 Columns: 7── Column specification ───────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Rows: 267 Columns: 7── Column specification ───────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Rows: 200 Columns: 7── Column specification ───────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Rows: 166 Columns: 7── Column specification ───────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`
results_concat_barcodes
Let’s summarize results and plot for genus, species and family accuracy
# Species-level summary and area plot for the concatenated barcodes.
concat_summary_species <- results_concat_barcodes %>%
  summarize_results('species')
p_concat_species <- concat_summary_species %>%
  plot_area(title = 'Concatenated barcodes species', relative = FALSE, xlim_all = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_concat_species
# Genus-level summary and area plot for the concatenated barcodes.
concat_summary_genus <- results_concat_barcodes %>%
  summarize_results('genus')
p_concat_genus <- concat_summary_genus %>%
  plot_area(title = 'Concatenated barcodes genus', relative = TRUE, xlim_all = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_concat_genus
# Family-level summary and area plot for the concatenated barcodes.
concat_summary_family <- results_concat_barcodes %>%
  summarize_results('family')
p_concat_family <- concat_summary_family %>%
  plot_area(title = 'Concatenated barcodes family', relative = TRUE, xlim_all = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_concat_family
Now let’s compare methods side by side. For genus level:
# Stack the genus-level accuracy panels of all methods in a single column.
# Only the bottom panel (concatenated barcodes) keeps its x-axis text and title;
# the theme override is applied once to all upper panels instead of being repeated.
p = list(p_genus, p_skmer_genus, p_barcode_genus$ITS, p_barcode_genus$rbcL) %>%
  purrr::map(function(panel) {
    panel + theme(axis.text.x = element_blank(),
                  axis.title.x = element_blank())
  }) %>%
  c(list(p_concat_genus)) %>%
  patchwork::wrap_plots(ncol = 1) +
  plot_annotation(title = 'Genus-level accuracy')
p
# Pass the plot explicitly rather than relying on ggplot2::last_plot()
ggsave('images_manuscript/fig3_genus_accuracy.pdf', plot = p, width=5,height = 10)
ggsave('images_manuscript/fig3_genus_accuracy.png', plot = p, width=5,height = 10,dpi=1200)
Now for species level:
# Stack the species-level accuracy panels of all methods in a single column.
# Only the bottom panel (concatenated barcodes) keeps its x-axis text and title;
# the theme override is applied once to all upper panels instead of being repeated.
p = list(p_species, p_skmer_species, p_barcode_species$ITS, p_barcode_species$rbcL) %>%
  purrr::map(function(panel) {
    panel + theme(axis.text.x = element_blank(),
                  axis.title.x = element_blank())
  }) %>%
  c(list(p_concat_species)) %>%
  patchwork::wrap_plots(ncol = 1) +
  plot_annotation(title = 'species-level accuracy')
p
# Pass the plot explicitly rather than relying on ggplot2::last_plot()
ggsave('images_manuscript/fig3_species_accuracy.pdf', plot = p, width=5,height = 10)
ggsave('images_manuscript/fig3_species_accuracy.png', plot = p, width=5,height = 10,dpi=1200)
Now for family level:
# Stack the family-level accuracy panels of all methods in a single column.
# Only the bottom panel (concatenated barcodes) keeps its x-axis text and title;
# the theme override is applied once to all upper panels instead of being repeated.
p = list(p_family, p_skmer_family, p_barcode_family$ITS, p_barcode_family$rbcL) %>%
  purrr::map(function(panel) {
    panel + theme(axis.text.x = element_blank(),
                  axis.title.x = element_blank())
  }) %>%
  c(list(p_concat_family)) %>%
  patchwork::wrap_plots(ncol = 1) +
  plot_annotation(title = 'family-level accuracy')
p
# Pass the plot explicitly rather than relying on ggplot2::last_plot()
ggsave('images_manuscript/fig3_family_accuracy.pdf', plot = p, width=5,height = 10)
ggsave('images_manuscript/fig3_family_accuracy.png', plot = p, width=5,height = 10,dpi=1200)
Now let’s compare the time to produce references and to produce
Finally, let’s summarize results for the whole SRA dataset. In this case, we only have varKoder since Skmer cannot finish and traditional barcodes are inapplicable.
# Load varKoder predictions for the SRA dataset: drop the first column of the
# file, keep only standardized query sizes and flag every row as quality-included.
varKoder_SRA_results = read_csv('all_SRA/varkoder_query_results/predictions.csv') %>%
  select(-1) %>%
  filter(str_detect(query_basepairs,'^0+[125]0+K$')) %>% #we will ignore queries that are not standardized sizes
  rename(query_bp = query_basepairs) %>%
  mutate(quality_included = TRUE) # TRUE rather than the reassignable shorthand T
New names:Rows: 8607 Columns: 873── Column specification ──────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (7): varKode_image_path, sample_id, query_basepairs, query_kmer_len, trained_model_path, prediction_...
dbl (865): ...1, prediction_threshold, actual_labels, basefrequency_sd, 10066, 101087, 10135, 10167, 10193...
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
plan(sequential)
# Every distinct label found in the SRA results, ignoring the low-quality flag.
SRA_taxlabels <- varKoder_SRA_results$actual_labels %>%
  str_remove(";*low_quality:True;*") %>%
  str_split(';') %>%
  unlist() %>%
  unique()
# Score each query: correct when its label is among the predicted labels,
# incorrect when a prediction exists and contains a label other than the query's.
varKoder_SRA_results <- varKoder_SRA_results %>%
  mutate(
    # unlist() requires exactly one label per row once the flag is removed
    query_labels = actual_labels %>%
      str_remove(";*low_quality:True;*") %>%
      str_split(';') %>%
      unlist(),
    predicted_list = str_split(predicted_labels, ';')
  ) %>%
  rowwise() %>%
  mutate(
    family_correct = query_labels %in% predicted_list,
    family_incorrect = !is.na(predicted_labels) &&
      any(!(predicted_list %in% query_labels))
  ) %>%
  # keep only columns whose name does not start with a digit
  select(matches("^[^0-9]"))
varKoder_SRA_results
NA
Now let’s summarize and plot:
# Family-level summary of the SRA predictions.
SRA_summary_family <- varKoder_SRA_results %>%
  summarize_results('family')
SRA_summary_family
# Sum of N for each query size
N_samp <- summarise(group_by(SRA_summary_family, query_bp), N = sum(N))
p_SRA_family <- SRA_summary_family %>%
  plot_area('varKoder SRA family', relative = TRUE, xlim_all = FALSE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_SRA_family
Let’s now do the SRA plot, but splitting by kingdom. First, we need to retrieve kingdom information:
# Attach the Kingdom of every run to the SRA results, summarize family-level
# accuracy separately for each Kingdom and plot one facet per Kingdom.
p_SRA_families = read_csv('all_SRA/runs_to_download_data.csv') %>%
  select(sample_id = Run, Kingdom) %>%
  # explicit key; right join keeps every row of the results table
  right_join(varKoder_SRA_results, by = 'sample_id') %>%
  split(.$Kingdom) %>%
  purrr::map_df(summarize_results,
                level='family',
                .id='Kingdom'
  ) %>%
  mutate(Kingdom = factor(Kingdom,levels=c('Metazoa','Viridiplantae','Fungi'),ordered = TRUE)) %>%
  plot_area(relative=FALSE,xlim_all = FALSE,wrap = '~Kingdom',title='Families in SRA') +
  coord_cartesian(xlim=c(500,10000)*1000,expand = FALSE)
Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
dat <- vroom(...)
problems(dat)Rows: 8264 Columns: 51── Column specification ──────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (28): Run, AssemblyName, download_path, Experiment, LibraryName, LibraryStrategy, LibrarySelection, L...
dbl (11): spots, bases, spots_with_mates, avgLength, size_MB, InsertSize, InsertDev, Study_Pubmed_id, Pro...
lgl (10): g1k_pop_code, source, g1k_analysis_group, Subject_ID, Disease, Affection_Status, Analyte_Type, ...
dttm (2): ReleaseDate, LoadDate
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Coordinate system already present. Adding new coordinate system, which will replace the existing one.
print(p_SRA_families)
# Pass the plot explicitly rather than relying on ggplot2::last_plot()
ggsave('images_manuscript/fig3_SRA_accuracy.pdf', plot = p_SRA_families, width=4.5,height = 4)
ggsave('images_manuscript/fig3_SRA_accuracy.png', plot = p_SRA_families, width=4.5,height = 4,dpi = 1200)
Now we will make a small figure to include the additional datasets in which we applied varKoding.
In these cases, we chose a test set that included both taxa in the
training set and taxa not in the training set, so we will graph both
separately. This is denoted by a column named
in_training_model. Let’s start by reading results.
Let’s define a function to read and process predictions:
# Read one varKoder prediction table and flag correct / incorrect predictions.
#
# Args:
#   infile: path to a prediction CSV. The code uses the columns `sample_id`,
#           `query_basepairs`, `actual_labels` and `predicted_labels`.
#
# Returns:
#   A rowwise tibble: the input without its first column, `query_basepairs`
#   renamed to `query_bp`, plus the list-columns `query_labels` and
#   `predicted_list` and the logical columns
#     taxon_correct   - any query label is among the predicted labels
#     taxon_incorrect - any non-missing predicted label is not a query label
read_and_process_others = function(infile){
  read_csv(infile) %>%
    mutate(sample_id = as.character(sample_id)) %>%
    select(-1) %>%
    rename(query_bp = query_basepairs) %>%
    mutate(
      # split label strings on ';' after removing the low-quality flag
      query_labels = str_remove(actual_labels,";*low_quality:True;*") %>% str_split(';'),
      predicted_list = str_split(predicted_labels,';')
    ) %>%
    rowwise() %>%
    mutate(
      taxon_correct = any(query_labels %in% predicted_list),
      taxon_incorrect = any(!(predicted_list[!is.na(predicted_list)] %in% query_labels))
    )
}
Now let’s apply this function to all files.
# Prediction tables of the additional datasets, named by the part of the file
# name that precedes '_prediction_table.csv'; that name becomes the `dataset` column.
prediction_files = list.files('other_datasets',pattern = 'prediction.+csv',full.names = TRUE)
names(prediction_files) = basename(prediction_files) %>% str_extract(".*(?=_prediction_table\\.csv)")
other_results = purrr::map_dfr(prediction_files, read_and_process_others, .id='dataset')
Rows: 18 Columns: 16── Column specification ───────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (7): sample_id, query_basepairs, query_kmer_len, prediction_type, in_training_model, predicted_labels, ...
dbl (8): Bembidion, a, basefrequency_sd, ampliatum, breve, lividulum, saturatum, testatum
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 18 Columns: 16── Column specification ───────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (7): sample_id, query_basepairs, query_kmer_len, prediction_type, in_training_model, predicted_labels, ...
dbl (8): Corallorhiza, prediction_threshold, basefrequency_sd, Corallorhiza bentleyi, Corallorhiza striata,...
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 25 Columns: 16── Column specification ───────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (7): sample_id, query_basepairs, query_kmer_len, prediction_type, in_training_model, predicted_labels, ...
dbl (8): Mycobacterium tuberculosis, prediction_threshold, basefrequency_sd, 1.2.2.1, 2.2.1.1.1, 3.1.2, 4.1...
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 15 Columns: 16── Column specification ───────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (7): sample_id, query_basepairs, query_kmer_len, prediction_type, in_training_model, predicted_labels, ...
dbl (8): Xanthoparmelia, prediction_threshold, basefrequency_sd, camtschadalis, chlorochroa, coloradoensis,...
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
other_results
Let’s now summarize by dataset and separately for taxa included and excluded from the training set.
# Summarize every dataset x in_training_model combination separately, then
# split the combined key back into its two parts and label them for plotting.
summary_others = other_results %>%
  split(interaction(other_results$dataset, other_results$in_training_model)) %>%
  purrr::map_dfr(summarize_results, level = 'taxon', .id = 'comb') %>%
  # interaction() joins the two keys with '.', so split on the literal dot
  separate(comb, into = c("dataset", "taxon_in_training_raw"), sep = "\\.") %>%
  mutate(taxon_in_training = if_else(taxon_in_training_raw == 'yes',
                                     'Taxon in training set',
                                     'Taxon not in training set'),
         # capitalize the first letter of the dataset name
         dataset = str_replace(dataset, "^(.)", ~toupper(.x)),
         result = factor(result,
                         levels=c("correct", "ambiguous", "inconclusive", "incorrect"),
                         ordered=TRUE)) %>%
  select(-taxon_in_training_raw)
summary_others
NA
Now let’s plot
# Stacked bars of result counts per dataset, one facet row for taxa in the
# training set and one for taxa not in it.
# (The former scale_alpha_manual() was dropped: no alpha aesthetic is mapped.)
p_others = ggplot(summary_others , aes(x = dataset, y = N, fill = result)) +
  geom_col()+
  scale_fill_manual(values = setNames(RColorBrewer::brewer.pal(4, "Accent"), c("correct", "ambiguous", "inconclusive", "incorrect"))) +
  ggtitle('Other datasets') +
  ylab('Number of samples') +
  theme_few() +
  scale_y_continuous(minor_breaks = waiver()) +
  theme(panel.background = element_rect(fill = NA),
        panel.grid.major.y = element_line(colour = gray(0.5)),
        panel.grid.minor.y = element_line(colour = gray(0.6),linetype = 2),
        axis.title.x = element_blank(),
        axis.text.x = element_text(face='italic'),
        panel.ontop = TRUE) + # draw the grid lines over the bars
  coord_cartesian(expand=FALSE) +
  facet_grid(taxon_in_training~.)
p_others
# Pass the plot explicitly rather than relying on ggplot2::last_plot()
ggsave('images_manuscript/fig3_others_accuracy.pdf', plot = p_others, width=4.5,height = 4)
ggsave('images_manuscript/fig3_others_accuracy.png', plot = p_others, width=4.5,height = 4,dpi = 1200)
Here we just query our results to get a few figures that we report in the paper.
Total number of samples used in cross-validation:
dim(samp_labels)
[1] 287 2
Number of Stigmaphyllon samples with each kind of error for varkoder:
summary_species
Number of Stigmaphyllon samples with each kind of error for skmer:
skmer_summary_species
Traditional barcode accuracy for species:
Concatenated barcode accuracy for species:
varKoder accuracy for genera:
summary_genus
varKoder accuracy for family:
summary_family
Skmer accuracy for genera:
skmer_summary_genus
Skmer accuracy for family:
skmer_summary_family
Number of samples available for each genus and data amount
# Count samples for every genus and data amount. The genus is now part of the
# grouping, and genus x amount combinations without samples are filled with 0.
results %>%
  mutate(genus = str_extract(actual_labels,"(?<=genus:)[^;]+")) %>%
  group_by(genus, query_bp) %>%
  summarize(N=n(), .groups = 'drop') %>%
  complete(genus, query_bp, fill = list(N=0))
Plot number of samples for supplementary material.
# Number of samples per genus and query size; absent combinations count as 0
# and genera are ordered by their sample counts.
n_samples_genera <- results %>%
  mutate(taxon = str_extract(actual_labels, "(?<=genus:)[^;]+")) %>%
  group_by(taxon, query_bp) %>%
  summarize(N = n(), .groups = 'drop') %>%
  complete(taxon, query_bp, fill = list(N = 0)) %>%
  mutate(taxon = fct_reorder(taxon, N))
`summarise()` has grouped output by 'taxon'. You can override using the `.groups` argument.
n_samples_genera
# Number of samples per species and query size (samples without a species
# label are excluded); absent combinations count as 0.
n_samples_species <- results %>%
  mutate(taxon = str_extract(actual_labels, "(?<=species:)[^;]+")) %>%
  filter(!is.na(taxon)) %>%
  group_by(taxon, query_bp) %>%
  summarize(N = n(), .groups = 'drop') %>%
  complete(taxon, query_bp, fill = list(N = 0)) %>%
  mutate(taxon = fct_reorder(taxon, N))
`summarise()` has grouped output by 'taxon'. You can override using the `.groups` argument.
n_samples_species
# Number of SRA samples per label and query size; absent combinations count as 0.
n_samples_SRA <- varKoder_SRA_results %>%
  mutate(taxon = as.character(actual_labels)) %>%
  group_by(taxon, query_bp) %>%
  summarize(N = n(), .groups = 'drop') %>%
  complete(taxon, query_bp, fill = list(N = 0)) %>%
  mutate(taxon = fct_reorder(taxon, N))
`summarise()` has grouped output by 'taxon'. You can override using the `.groups` argument.
n_samples_SRA
For SRA, we have to count both validation and training samples, since we did not do cross-validation. Let’s use image names to get the information and then the results table to figure out which ones were in the validation set.
# For every image file, take the text before '@' in its path and check
# whether it is one of the sample IDs in the results table.
# `pattern` is a regular expression, not a glob, so match the extension explicitly.
list.files('all_SRA/varkoder_images_SRA/',pattern='\\.png$',recursive = TRUE) %>%
  str_extract("^(.+)(?=@)") %>%
  is.element(varKoder_SRA_results$sample_id) %>%
  summary()
Mode TRUE
logical 41103
# Stacked area plot of the number of samples per taxon against query size.
#
# Args:
#   df: data frame with columns `taxon`, `query_bp` (text parsed with
#       parse_number() and multiplied by 1000) and `N`.
#   title: plot title.
#   axis_query_bp: query sizes, in the same text format as `df$query_bp`, that
#       define the breaks and limits of the x axis. Defaults to the global
#       `n_samples_genera$query_bp`, which the function previously hard-coded.
#
# Returns:
#   A ggplot object.
plot_Nsamples_area = function(df, title, axis_query_bp = n_samples_genera$query_bp){
  df = df %>%
    mutate(query_bp = parse_number(query_bp) *1000)

  # Interleave the first and second half of the palette:
  # colours 1, half_n + 1, 2, half_n + 2, ...
  # order() leaves ties in their original order, so this yields exactly
  # n_levels colours, without rbind() recycling when n_levels is odd or the
  # out-of-range index when there is a single level.
  n_levels <- length(unique(df$taxon))
  viridis_colors <- viridis::turbo(n_levels)
  half_n <- ceiling(n_levels / 2)
  reordered_colors <- viridis_colors[order(c(seq_len(half_n), seq_len(n_levels - half_n)))]

  axis_bp <- 1000*parse_number(unique(axis_query_bp))

  ggplot(df, aes(x=query_bp,y=N,fill=taxon, color = taxon, group = taxon)) +
    geom_area(position= position_stack()) +
    #geom_line(position='stack') +
    scale_fill_manual(values = reordered_colors,
                      aesthetics = c('colour','fill'),
                      guide = 'none') +
    scale_x_log10(labels = scales::label_number(scale_cut = scales::cut_si('bp')),
                  breaks = axis_bp,
                  limits = range(axis_bp)) +
    scale_y_continuous(n.breaks = 10, minor_breaks = waiver()) +
    ggtitle(title) +
    ylab('Number of samples') +
    xlab('Base pairs in query images') +
    theme_few() +
    theme(axis.text.x = element_text(hjust=1,angle=45),
          panel.background = element_rect(fill = NA),
          panel.grid.major.y = element_line(colour = gray(0.5)),
          panel.grid.minor.y = element_line(colour = gray(0.6),linetype = 2),
          panel.ontop = TRUE)
}
# NOTE(review): facet_wrap() needs a `validation_set` column; the code that
# builds n_samples_SRA in this notebook groups only by taxon and query_bp -
# confirm that the column is added before this call.
plot_Nsamples_area(n_samples_SRA,title='Eukaryotic families') + facet_wrap(~validation_set)
Warning: number of columns of result is not a multiple of vector length (arg 2)
Total number of SRA samples. Validation:
# Number of rows of the model input table for each value of `is_valid`
# (the first column of the file is dropped before counting).
read_csv('varKoder/all_SRA/varkoder_trained_model_ML/input_data.csv') %>%
  select(-1) %>%
  count(is_valid, name = 'N')